If you like to eat cereal, do yourself a favor and avoid this dataset at all costs. After seeing these data it will never be the same for me to eat Fruity Pebbles again. Content Fields in the dataset: • Name: Name of cereal • mfr: Manufacturer of cereal ○ A = American Home Food Products; ○ G = General Mills ○ K = Kelloggs ○ N = Nabisco ○ P = Post ○ Q = Quaker Oats ○ R = Ralston Purina • type: ○ cold ○ hot • calories: calories per serving • protein: grams of protein • fat: grams of fat • sodium: milligrams of sodium • fiber: grams of dietary fiber • carbo: grams of complex carbohydrates • sugars: grams of sugars • potass: milligrams of potassium • vitamins: vitamins and minerals - 0, 25, or 100, indicating the typical percentage of FDA recommended • shelf: display shelf (1, 2, or 3, counting from the floor) • weight: weight in ounces of one serving • cups: number of cups in one serving • rating: a rating of the cereals (Possibly from Consumer Reports?)
We will use the tidyverse library for importing and wrangling the data.
# Change column names
data_cereal <- raw_data
colnames(data_cereal) <- c("Name", "Manufacturer", "Type", "Calories", "Protein", "Fat", "Sodium", "Fibre", "Carbohydrates", "Sugar", "Potassium", "Vitamins", "Shelf", "Weight", "Cups", "Rating")
# Create feature with full manufacturer name
data_cereal$Manufacturer_Name <- data_cereal$Manufacturer
data_cereal$Manufacturer_Name <- gsub(pattern = "P", replacement = "Post", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name <- gsub(pattern = "A", replacement = "American Home Food Products", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name <- gsub(pattern = "G", replacement = "General Mills", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name <- gsub(pattern = "K", replacement = "Kellogs", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name <- gsub(pattern = "N", replacement = "Nabisco", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name <- gsub(pattern = "Q", replacement = "Quaker Oats", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name <- gsub(pattern = "R", replacement = "Ralston Purina", x = data_cereal$Manufacturer_Name)
data_cereal$Manufacturer_Name## [1] "Nabisco" "Quaker Oats"
## [3] "Kellogs" "Kellogs"
## [5] "Ralston Purina" "General Mills"
## [7] "Kellogs" "General Mills"
## [9] "Ralston Purina" "Post"
## [11] "Quaker Oats" "General Mills"
## [13] "General Mills" "General Mills"
## [15] "General Mills" "Ralston Purina"
## [17] "Kellogs" "Kellogs"
## [19] "General Mills" "Kellogs"
## [21] "Nabisco" "Kellogs"
## [23] "General Mills" "Ralston Purina"
## [25] "Kellogs" "Kellogs"
## [27] "Kellogs" "Post"
## [29] "Kellogs" "Post"
## [31] "Post" "General Mills"
## [33] "Post" "Post"
## [35] "Post" "Quaker Oats"
## [37] "General Mills" "Post"
## [39] "Kellogs" "Kellogs"
## [41] "General Mills" "Quaker Oats"
## [43] "General Mills" "American Home Food Products"
## [45] "Ralston Purina" "Ralston Purina"
## [47] "Kellogs" "General Mills"
## [49] "Kellogs" "Kellogs"
## [51] "Kellogs" "General Mills"
## [53] "Post" "Kellogs"
## [55] "Quaker Oats" "Quaker Oats"
## [57] "Quaker Oats" "Quaker Oats"
## [59] "Kellogs" "General Mills"
## [61] "Kellogs" "Ralston Purina"
## [63] "Kellogs" "Nabisco"
## [65] "Nabisco" "Nabisco"
## [67] "Kellogs" "Kellogs"
## [69] "Nabisco" "General Mills"
## [71] "General Mills" "General Mills"
## [73] "General Mills" "General Mills"
## [75] "Ralston Purina" "General Mills"
## [77] "General Mills"
# Replace 'H' and 'C' in Type with Hot and Cold
data_cereal$Type <- gsub("H", "Hot", x = data_cereal$Type)
data_cereal$Type <- gsub("C", "Cold", x = data_cereal$Type)
# Change cereal type and shelf from character to factor
data_cereal$Type <- factor(data_cereal$Type)
data_cereal$Shelf <- factor(data_cereal$Shelf)
data_cereal$Manufacturer <- factor(data_cereal$Manufacturer)
sapply(data_cereal, FUN = class)## Name Manufacturer Type Calories
## "character" "factor" "factor" "numeric"
## Protein Fat Sodium Fibre
## "numeric" "numeric" "numeric" "numeric"
## Carbohydrates Sugar Potassium Vitamins
## "numeric" "numeric" "numeric" "numeric"
## Shelf Weight Cups Rating
## "factor" "numeric" "numeric" "numeric"
## Manufacturer_Name
## "character"
## Name Manufacturer Type Calories Protein
## Length:77 A: 1 Cold:74 Min. : 50.0 Min. :1.000
## Class :character G:22 Hot : 3 1st Qu.:100.0 1st Qu.:2.000
## Mode :character K:23 Median :110.0 Median :3.000
## N: 6 Mean :106.9 Mean :2.545
## P: 9 3rd Qu.:110.0 3rd Qu.:3.000
## Q: 8 Max. :160.0 Max. :6.000
## R: 8
## Fat Sodium Fibre Carbohydrates
## Min. :0.000 Min. : 0.0 Min. : 0.000 Min. :-1.0
## 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000 1st Qu.:12.0
## Median :1.000 Median :180.0 Median : 2.000 Median :14.0
## Mean :1.013 Mean :159.7 Mean : 2.152 Mean :14.6
## 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000 3rd Qu.:17.0
## Max. :5.000 Max. :320.0 Max. :14.000 Max. :23.0
##
## Sugar Potassium Vitamins Shelf Weight
## Min. :-1.000 Min. : -1.00 Min. : 0.00 1:20 Min. :0.50
## 1st Qu.: 3.000 1st Qu.: 40.00 1st Qu.: 25.00 2:21 1st Qu.:1.00
## Median : 7.000 Median : 90.00 Median : 25.00 3:36 Median :1.00
## Mean : 6.922 Mean : 96.08 Mean : 28.25 Mean :1.03
## 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00 3rd Qu.:1.00
## Max. :15.000 Max. :330.00 Max. :100.00 Max. :1.50
##
## Cups Rating Manufacturer_Name
## Min. :0.250 Min. :18.04 Length:77
## 1st Qu.:0.670 1st Qu.:33.17 Class :character
## Median :0.750 Median :40.40 Mode :character
## Mean :0.821 Mean :42.67
## 3rd Qu.:1.000 3rd Qu.:50.83
## Max. :1.500 Max. :93.70
##
Carbohydrates, sugars and Potassium have some negative values. Since this is not possible we can replace negative values with NA.
# Replace negative values with NA
data_cereal$Carbohydrates[data_cereal$Carbohydrates < 0] <- NA
data_cereal$Sugar[data_cereal$Sugar < 0] <- NA
data_cereal$Potassium[data_cereal$Potassium < 0] <- NA
summary(data_cereal)## Name Manufacturer Type Calories Protein
## Length:77 A: 1 Cold:74 Min. : 50.0 Min. :1.000
## Class :character G:22 Hot : 3 1st Qu.:100.0 1st Qu.:2.000
## Mode :character K:23 Median :110.0 Median :3.000
## N: 6 Mean :106.9 Mean :2.545
## P: 9 3rd Qu.:110.0 3rd Qu.:3.000
## Q: 8 Max. :160.0 Max. :6.000
## R: 8
## Fat Sodium Fibre Carbohydrates
## Min. :0.000 Min. : 0.0 Min. : 0.000 Min. : 5.0
## 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000 1st Qu.:12.0
## Median :1.000 Median :180.0 Median : 2.000 Median :14.5
## Mean :1.013 Mean :159.7 Mean : 2.152 Mean :14.8
## 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000 3rd Qu.:17.0
## Max. :5.000 Max. :320.0 Max. :14.000 Max. :23.0
## NA's :1
## Sugar Potassium Vitamins Shelf Weight
## Min. : 0.000 Min. : 15.00 Min. : 0.00 1:20 Min. :0.50
## 1st Qu.: 3.000 1st Qu.: 42.50 1st Qu.: 25.00 2:21 1st Qu.:1.00
## Median : 7.000 Median : 90.00 Median : 25.00 3:36 Median :1.00
## Mean : 7.026 Mean : 98.67 Mean : 28.25 Mean :1.03
## 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00 3rd Qu.:1.00
## Max. :15.000 Max. :330.00 Max. :100.00 Max. :1.50
## NA's :1 NA's :2
## Cups Rating Manufacturer_Name
## Min. :0.250 Min. :18.04 Length:77
## 1st Qu.:0.670 1st Qu.:33.17 Class :character
## Median :0.750 Median :40.40 Mode :character
## Mean :0.821 Mean :42.67
## 3rd Qu.:1.000 3rd Qu.:50.83
## Max. :1.500 Max. :93.70
##
library(tidyverse)
Manufacturers_Total <- data_cereal %>%
select(Manufacturer_Name, Type) %>%
group_by(Manufacturer_Name, Type) %>%
summarise(Total = n()) %>%
spread(key = Type, value = Total) %>%
replace_na(replace = list(Manufacturer_Name = 0, Cold = 0, Hot = 0)) %>%
mutate(Total = Cold + Hot) %>%
arrange(desc(Total))
Manufacturers_Total## # A tibble: 7 x 4
## # Groups: Manufacturer_Name [7]
## Manufacturer_Name Cold Hot Total
## <chr> <dbl> <dbl> <dbl>
## 1 Kellogs 23 0 23
## 2 General Mills 22 0 22
## 3 Post 9 0 9
## 4 Quaker Oats 7 1 8
## 5 Ralston Purina 8 0 8
## 6 Nabisco 5 1 6
## 7 American Home Food Products 0 1 1
ggplot(data_cereal, aes(x = factor(Manufacturer_Name, levels = rev(Manufacturers_Total$Manufacturer_Name)), fill = Type)) +
geom_bar() +
coord_flip(expand = FALSE) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_discrete(name = "Manufacturer") +
scale_y_continuous(name = "Count", minor_breaks = NULL) +
theme_minimal() +
labs(title = "Number of Products by Manufacturer")data_cereal %>%
ggplot(aes(x = Weight, fill = Manufacturer_Name)) +
geom_histogram() +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Weight (in ounces)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 70)) +
labs(fill = "Manufacturer", title = "Distribution of Weight per Serving", subtitle = "Manufacturers use different weights for servings") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Weight*28.3495, fill = Manufacturer_Name)) +
geom_histogram() +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Weight (g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 70)) +
labs(fill = "Manufacturer", title = "Distribution of Weight per Serving", subtitle = "Manufacturers use different weights for servings") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Cups, fill = Manufacturer_Name)) +
geom_histogram() +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Number of cups in one serving", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 35)) +
labs(fill = "Manufacturer", title = "Distribution of Number of Cups per Serving", subtitle = "Different products have different cup sizes") +
theme_minimal()## Rows: 77
## Columns: 17
## $ Name <chr> "100% Bran", "100% Natural Bran", "All-Bran", "All-…
## $ Manufacturer <fct> N, Q, K, K, R, G, K, G, R, P, Q, G, G, G, G, R, K, …
## $ Type <fct> Cold, Cold, Cold, Cold, Cold, Cold, Cold, Cold, Col…
## $ Calories <dbl> 70, 120, 70, 50, 110, 110, 110, 130, 90, 90, 120, 1…
## $ Protein <dbl> 4, 3, 4, 4, 2, 2, 2, 3, 2, 3, 1, 6, 1, 3, 1, 2, 2, …
## $ Fat <dbl> 1, 5, 1, 0, 2, 2, 0, 2, 1, 0, 2, 2, 3, 2, 1, 0, 0, …
## $ Sodium <dbl> 130, 15, 260, 140, 200, 180, 125, 210, 200, 210, 22…
## $ Fibre <dbl> 10.0, 2.0, 9.0, 14.0, 1.0, 1.5, 1.0, 2.0, 4.0, 5.0,…
## $ Carbohydrates <dbl> 5.0, 8.0, 7.0, 8.0, 14.0, 10.5, 11.0, 18.0, 15.0, 1…
## $ Sugar <dbl> 6, 8, 5, 0, 8, 10, 14, 8, 6, 5, 12, 1, 9, 7, 13, 3,…
## $ Potassium <dbl> 280, 135, 320, 330, NA, 70, 30, 100, 125, 190, 35, …
## $ Vitamins <dbl> 25, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, …
## $ Shelf <fct> 3, 3, 3, 3, 3, 1, 2, 3, 1, 3, 2, 1, 2, 3, 2, 1, 1, …
## $ Weight <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.33, 1.0…
## $ Cups <dbl> 0.33, 1.00, 0.33, 0.50, 0.75, 0.75, 1.00, 0.75, 0.6…
## $ Rating <dbl> 68.40297, 33.98368, 59.42551, 93.70491, 34.38484, 2…
## $ Manufacturer_Name <chr> "Nabisco", "Quaker Oats", "Kellogs", "Kellogs", "Ra…
# Add nutritionals per ounce
data_cereal$Calories_oz <- data_cereal$Calories * data_cereal$Weight
data_cereal$Protein_oz <- data_cereal$Protein * data_cereal$Weight
data_cereal$Fat_oz <- data_cereal$Fat * data_cereal$Weight
data_cereal$Sodium_oz <- data_cereal$Sodium * data_cereal$Weight
data_cereal$Fibre_oz <- data_cereal$Fibre * data_cereal$Weight
data_cereal$Carbohydrates_oz <- data_cereal$Carbohydrates * data_cereal$Weight
data_cereal$Sugar_oz <- data_cereal$Sugar * data_cereal$Weight
data_cereal$Potassium_oz <- data_cereal$Potassium * data_cereal$Weight
data_cereal$Vitamins_oz <- data_cereal$Vitamins * data_cereal$Weight
# Add nutritionals per 100g
# 1 oz. = 28.3495g
# 100g = 3.5274 oz.
data_cereal$Calories_100g <- round(data_cereal$Calories_oz * 3.5274, 0)
data_cereal$Protein_100g <- round(data_cereal$Protein_oz * 3.5274, 1)
data_cereal$Fat_100g <- round(data_cereal$Fat_oz * 3.5274, 1)
data_cereal$Sodium_100g <- round(data_cereal$Sodium_oz * 3.5274, 1)
data_cereal$Fibre_100g <- round(data_cereal$Fibre_oz * 3.5274, 1)
data_cereal$Carbohydrates_100g <- round(data_cereal$Carbohydrates_oz * 3.5274, 1)
data_cereal$Sugar_100g <- round(data_cereal$Sugar_oz * 3.5274, 1)
data_cereal$Potassium_100g <- round(data_cereal$Potassium_oz * 3.5274, 1)
data_cereal$Vitamins_100g <- round(data_cereal$Vitamins_oz * 3.5274, 1)
glimpse(data_cereal)## Rows: 77
## Columns: 35
## $ Name <chr> "100% Bran", "100% Natural Bran", "All-Bran", "All…
## $ Manufacturer <fct> N, Q, K, K, R, G, K, G, R, P, Q, G, G, G, G, R, K,…
## $ Type <fct> Cold, Cold, Cold, Cold, Cold, Cold, Cold, Cold, Co…
## $ Calories <dbl> 70, 120, 70, 50, 110, 110, 110, 130, 90, 90, 120, …
## $ Protein <dbl> 4, 3, 4, 4, 2, 2, 2, 3, 2, 3, 1, 6, 1, 3, 1, 2, 2,…
## $ Fat <dbl> 1, 5, 1, 0, 2, 2, 0, 2, 1, 0, 2, 2, 3, 2, 1, 0, 0,…
## $ Sodium <dbl> 130, 15, 260, 140, 200, 180, 125, 210, 200, 210, 2…
## $ Fibre <dbl> 10.0, 2.0, 9.0, 14.0, 1.0, 1.5, 1.0, 2.0, 4.0, 5.0…
## $ Carbohydrates <dbl> 5.0, 8.0, 7.0, 8.0, 14.0, 10.5, 11.0, 18.0, 15.0, …
## $ Sugar <dbl> 6, 8, 5, 0, 8, 10, 14, 8, 6, 5, 12, 1, 9, 7, 13, 3…
## $ Potassium <dbl> 280, 135, 320, 330, NA, 70, 30, 100, 125, 190, 35,…
## $ Vitamins <dbl> 25, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,…
## $ Shelf <fct> 3, 3, 3, 3, 3, 1, 2, 3, 1, 3, 2, 1, 2, 3, 2, 1, 1,…
## $ Weight <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.33, 1.…
## $ Cups <dbl> 0.33, 1.00, 0.33, 0.50, 0.75, 0.75, 1.00, 0.75, 0.…
## $ Rating <dbl> 68.40297, 33.98368, 59.42551, 93.70491, 34.38484, …
## $ Manufacturer_Name <chr> "Nabisco", "Quaker Oats", "Kellogs", "Kellogs", "R…
## $ Calories_oz <dbl> 70.0, 120.0, 70.0, 50.0, 110.0, 110.0, 110.0, 172.…
## $ Protein_oz <dbl> 4.00, 3.00, 4.00, 4.00, 2.00, 2.00, 2.00, 3.99, 2.…
## $ Fat_oz <dbl> 1.00, 5.00, 1.00, 0.00, 2.00, 2.00, 0.00, 2.66, 1.…
## $ Sodium_oz <dbl> 130.0, 15.0, 260.0, 140.0, 200.0, 180.0, 125.0, 27…
## $ Fibre_oz <dbl> 10.00, 2.00, 9.00, 14.00, 1.00, 1.50, 1.00, 2.66, …
## $ Carbohydrates_oz <dbl> 5.00, 8.00, 7.00, 8.00, 14.00, 10.50, 11.00, 23.94…
## $ Sugar_oz <dbl> 6.00, 8.00, 5.00, 0.00, 8.00, 10.00, 14.00, 10.64,…
## $ Potassium_oz <dbl> 280, 135, 320, 330, NA, 70, 30, 133, 125, 190, 35,…
## $ Vitamins_oz <dbl> 25.00, 0.00, 25.00, 25.00, 25.00, 25.00, 25.00, 33…
## $ Calories_100g <dbl> 247, 423, 247, 176, 388, 388, 388, 610, 317, 317, …
## $ Protein_100g <dbl> 14.1, 10.6, 14.1, 14.1, 7.1, 7.1, 7.1, 14.1, 7.1, …
## $ Fat_100g <dbl> 3.5, 17.6, 3.5, 0.0, 7.1, 7.1, 0.0, 9.4, 3.5, 0.0,…
## $ Sodium_100g <dbl> 458.6, 52.9, 917.1, 493.8, 705.5, 634.9, 440.9, 98…
## $ Fibre_100g <dbl> 35.3, 7.1, 31.7, 49.4, 3.5, 5.3, 3.5, 9.4, 14.1, 1…
## $ Carbohydrates_100g <dbl> 17.6, 28.2, 24.7, 28.2, 49.4, 37.0, 38.8, 84.4, 52…
## $ Sugar_100g <dbl> 21.2, 28.2, 17.6, 0.0, 28.2, 35.3, 49.4, 37.5, 21.…
## $ Potassium_100g <dbl> 987.7, 476.2, 1128.8, 1164.0, NA, 246.9, 105.8, 46…
## $ Vitamins_100g <dbl> 88.2, 0.0, 88.2, 88.2, 88.2, 88.2, 88.2, 117.3, 88…
## Name Manufacturer Type Calories Protein
## Length:77 A: 1 Cold:74 Min. : 50.0 Min. :1.000
## Class :character G:22 Hot : 3 1st Qu.:100.0 1st Qu.:2.000
## Mode :character K:23 Median :110.0 Median :3.000
## N: 6 Mean :106.9 Mean :2.545
## P: 9 3rd Qu.:110.0 3rd Qu.:3.000
## Q: 8 Max. :160.0 Max. :6.000
## R: 8
## Fat Sodium Fibre Carbohydrates
## Min. :0.000 Min. : 0.0 Min. : 0.000 Min. : 5.0
## 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000 1st Qu.:12.0
## Median :1.000 Median :180.0 Median : 2.000 Median :14.5
## Mean :1.013 Mean :159.7 Mean : 2.152 Mean :14.8
## 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000 3rd Qu.:17.0
## Max. :5.000 Max. :320.0 Max. :14.000 Max. :23.0
## NA's :1
## Sugar Potassium Vitamins Shelf Weight
## Min. : 0.000 Min. : 15.00 Min. : 0.00 1:20 Min. :0.50
## 1st Qu.: 3.000 1st Qu.: 42.50 1st Qu.: 25.00 2:21 1st Qu.:1.00
## Median : 7.000 Median : 90.00 Median : 25.00 3:36 Median :1.00
## Mean : 7.026 Mean : 98.67 Mean : 28.25 Mean :1.03
## 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00 3rd Qu.:1.00
## Max. :15.000 Max. :330.00 Max. :100.00 Max. :1.50
## NA's :1 NA's :2
## Cups Rating Manufacturer_Name Calories_oz
## Min. :0.250 Min. :18.04 Length:77 Min. : 25.0
## 1st Qu.:0.670 1st Qu.:33.17 Class :character 1st Qu.:100.0
## Median :0.750 Median :40.40 Mode :character Median :110.0
## Mean :0.821 Mean :42.67 Mean :112.1
## 3rd Qu.:1.000 3rd Qu.:50.83 3rd Qu.:110.0
## Max. :1.500 Max. :93.70 Max. :240.0
##
## Protein_oz Fat_oz Sodium_oz Fibre_oz
## Min. :0.500 Min. :0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 0.500
## Median :3.000 Median :1.000 Median :180.0 Median : 2.000
## Mean :2.656 Mean :1.075 Mean :168.2 Mean : 2.303
## 3rd Qu.:3.750 3rd Qu.:2.000 3rd Qu.:225.0 3rd Qu.: 3.000
## Max. :6.000 Max. :5.000 Max. :320.0 Max. :14.000
##
## Carbohydrates_oz Sugar_oz Potassium_oz Vitamins_oz
## Min. : 5.00 Min. : 0.000 Min. : 7.5 Min. : 0.00
## 1st Qu.:12.00 1st Qu.: 3.000 1st Qu.: 40.0 1st Qu.: 25.00
## Median :15.00 Median : 7.000 Median : 90.0 Median : 25.00
## Mean :15.33 Mean : 7.535 Mean :106.1 Mean : 30.15
## 3rd Qu.:18.16 3rd Qu.:11.175 3rd Qu.:129.0 3rd Qu.: 25.00
## Max. :27.93 Max. :21.000 Max. :345.8 Max. :150.00
## NA's :1 NA's :1 NA's :2
## Calories_100g Protein_100g Fat_100g Sodium_100g
## Min. : 88.0 Min. : 1.800 Min. : 0.00 Min. : 0.0
## 1st Qu.:353.0 1st Qu.: 7.100 1st Qu.: 0.00 1st Qu.: 458.6
## Median :388.0 Median :10.600 Median : 3.50 Median : 634.9
## Mean :395.3 Mean : 9.384 Mean : 3.79 Mean : 593.5
## 3rd Qu.:388.0 3rd Qu.:13.200 3rd Qu.: 7.10 3rd Qu.: 793.7
## Max. :847.0 Max. :21.200 Max. :17.60 Max. :1128.8
##
## Fibre_100g Carbohydrates_100g Sugar_100g Potassium_100g
## Min. : 0.000 Min. :17.60 Min. : 0.00 Min. : 26.5
## 1st Qu.: 1.800 1st Qu.:42.30 1st Qu.:10.60 1st Qu.: 141.1
## Median : 7.100 Median :52.90 Median :24.70 Median : 317.5
## Mean : 8.127 Mean :54.06 Mean :26.58 Mean : 374.3
## 3rd Qu.:10.600 3rd Qu.:64.05 3rd Qu.:39.42 3rd Qu.: 455.0
## Max. :49.400 Max. :98.50 Max. :74.10 Max. :1219.8
## NA's :1 NA's :1 NA's :2
## Vitamins_100g
## Min. : 0.0
## 1st Qu.: 88.2
## Median : 88.2
## Mean :106.3
## 3rd Qu.: 88.2
## Max. :529.1
##
# Scatter Plot Matrix
# install.packages("GGally")
library(GGally)
# Create function to add regression line to scatter plot matrix
sm_regression <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point(alpha = 0.4) +
geom_smooth(method=lm, fill="grey10", color="grey10", ...)
p
}
data_cereal %>%
select(Manufacturer_Name, Calories_100g, Protein, Fat, Sodium, Fibre, Carbohydrates, Sugar, Potassium, Shelf, Rating) %>%
ggpairs(columns = 2:11, lower = list(continuous = sm_regression)) +
theme_bw()data_cereal %>%
select(Calories = Calories_100g, Protein, Fat, Sodium, Fibre, Carbs = Carbohydrates, Sugar, Potassium, Shelf, Rating) %>%
ggcorr(palette = "RdBu", label = TRUE, label_round = 2)data_cereal %>%
select(Name, Manufacturer_Name, Type, Calories, Calories_oz, Calories_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Calories (g per serving)", "Calories (g per ounce)", "Calories (g per 100g)"))There seem to be some mistakes in the dataset regarding calorie content. As there are products that have almost no calories (<90 kcal) and products that have close to the maximum amount of calories possible per 100g of product (900 kcal).
Since we know that fat has 9 kcal/g and protein and carbohydrates have 4 kcal/g we will recalculate the calories from the nutritional data available and replot the histogram of calories per 100g.
# Protein: 4 kcal/g
# Carbohydrates: 4 kcal/g
# Fat: 9 kcal/g
# Calories = 9 * Fat + 4 * Protein + 4 * Carbohydrates
data_cereal$Calories_100g_calculated <- data_cereal$Fat_100g * 9 + data_cereal$Protein_100g * 4 + data_cereal$Carbohydrates_100g * 4 # Summary Table of Calories per 100g
data_cereal %>%
select(Manufacturer_Name, Calories_100g_calculated, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Calories_100g_calculated, na.rm = TRUE), 1),
Median = round(median(Calories_100g_calculated, na.rm = TRUE), 1),
Lowest = min(Calories_100g_calculated, na.rm = TRUE),
Highest = max(Calories_100g_calculated, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 314. 314. 314. 314. 1
## 2 General Mills 299. 288. 215. 479. 22
## 3 Kellogs 296. 296. 169. 535 23
## 4 Nabisco 264. 275. 158. 339. 6
## 5 Post 265. 282. 184. 344. 9
## 6 Quaker Oats 224. 247. 84.4 314. 8
## 7 Ralston Purina 324. 326. 272. 377. 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Calories_100g_calculated, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Calories (g per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Calorie Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Calories_100g_calculated)) +
geom_density(fill = "grey", alpha = 0.8, adjust = 3, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Calories (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Calories in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Calories_100g_calculated, fill = Manufacturer_Name)) +
geom_histogram() +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Calories (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 10), breaks = seq(0, 10, 2)) +
labs(fill = "Manufacturer", title = "Distribution of Calories in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Calories_100g_calculated, fill = Type)) +
geom_histogram() +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Calories (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 10), breaks = seq(0, 10, 2)) +
labs(fill = "Manufacturer", title = "Distribution of Calories in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Fat, Fat_oz, Fat_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Fat (g per serving)", "Fat (g per ounce)", "Fat (g per 100g)"))# Summary Table of Fat per 100g
data_cereal %>%
select(Manufacturer_Name, Fat_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Fat_100g, na.rm = TRUE), 1),
Median = round(median(Fat_100g, na.rm = TRUE), 1),
Lowest = min(Fat_100g, na.rm = TRUE),
Highest = max(Fat_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 3.5 3.5 3.5 3.5 1
## 2 General Mills 5.1 3.5 3.5 10.6 22
## 3 Kellogs 2.5 0 0 10.6 23
## 4 Nabisco 0.6 0 0 3.5 6
## 5 Post 3.5 3.5 0 10.6 9
## 6 Quaker Oats 6.2 7.1 0 17.6 8
## 7 Ralston Purina 4.4 3.5 0 10.6 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Fat_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Fat (g per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Fat Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Fat_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Fat (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Fat in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Fat_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Fat (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Fat in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Fat_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Fat (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Fat in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Protein, Protein_oz, Protein_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Protein (g per serving)", "Protein (g per ounce)", "Protein (g per 100g)"))# Summary Table of Protein per 100g
data_cereal %>%
select(Manufacturer_Name, Protein_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Protein_100g, na.rm = TRUE), 1),
Median = round(median(Protein_100g, na.rm = TRUE), 1),
Lowest = min(Protein_100g, na.rm = TRUE),
Highest = max(Protein_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 14.1 14.1 14.1 14.1 1
## 2 General Mills 8.7 7.1 3.5 21.2 22
## 3 Kellogs 10.2 10.6 3.5 21.2 23
## 4 Nabisco 9.8 10.6 5.9 14.1 6
## 5 Post 9.3 10.6 3.5 14.1 9
## 6 Quaker Oats 8.6 7 1.8 17.6 8
## 7 Ralston Purina 8.8 7.1 3.5 14.1 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Protein_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Protein (g per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Protein Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Protein_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Protein (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Protein in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Protein_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Protein (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Protein in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Protein_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Protein (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Protein in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Carbohydrates, Carbohydrates_oz, Carbohydrates_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Carbohydrates (g per serving)", "Carbohydrates (g per ounce)", "Carbohydrates (g per 100g)"))# Summary Table of Carbohydrates per 100g
data_cereal %>%
select(Manufacturer_Name, Carbohydrates_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Carbohydrates_100g, na.rm = TRUE), 1),
Median = round(median(Carbohydrates_100g, na.rm = TRUE), 1),
Lowest = min(Carbohydrates_100g, na.rm = TRUE),
Highest = max(Carbohydrates_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 56.4 56.4 56.4 56.4 1
## 2 General Mills 54.6 52.9 37 84.4 22
## 3 Kellogs 58.1 56.4 24.7 98.5 23
## 4 Nabisco 54.8 60 17.6 74.1 6
## 5 Post 49.3 49.4 38.8 60 9
## 6 Quaker Oats 35 42.3 17.6 49.4 8
## 7 Ralston Purina 62.2 58.2 49.4 81.1 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Carbohydrates_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Carbohydrates (g per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Carbohydrates Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Carbohydrates_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Carbohydrates (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Carbohydrates in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Carbohydrates_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Carbohydrates (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Carbohydrates in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Carbohydrates_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Carbohydrates (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Carbohydrates in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Sugar, Sugar_oz, Sugar_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Sugar (g per serving)", "Sugar (g per ounce)", "Sugar (g per 100g)"))# Summary Table of Sugar per 100g
data_cereal %>%
select(Manufacturer_Name, Sugar_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Sugar_100g, na.rm = TRUE), 1),
Median = round(median(Sugar_100g, na.rm = TRUE), 1),
Lowest = min(Sugar_100g, na.rm = TRUE),
Highest = max(Sugar_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 10.6 10.6 10.6 10.6 1
## 2 General Mills 30 31.7 3.5 74.1 22
## 3 Kellogs 29.7 24.7 0 68.8 23
## 4 Nabisco 6.5 0 0 21.2 6
## 5 Post 33.7 38.8 10.6 65.7 9
## 6 Quaker Oats 21.7 21.2 0 42.3 8
## 7 Ralston Purina 21.6 19.4 7.1 38.8 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Sugar_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Sugar (g per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Sugar Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Sugar_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Sugar (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Sugar in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Sugar_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Sugar (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Sugar in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Sugar_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Sugar (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Sugar in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Fibre, Fibre_oz, Fibre_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Fibre (g per serving)", "Fibre (g per ounce)", "Fibre (g per 100g)"))# Summary Table of Fibre per 100g
data_cereal %>%
select(Manufacturer_Name, Fibre_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Fibre_100g, na.rm = TRUE), 1),
Median = round(median(Fibre_100g, na.rm = TRUE), 1),
Lowest = min(Fibre_100g, na.rm = TRUE),
Highest = max(Fibre_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 0 0 0 0 1
## 2 General Mills 5 5.3 0 21.2 22
## 3 Kellogs 10.6 3.5 0 49.4 23
## 4 Nabisco 13.8 10.6 3.5 35.3 6
## 5 Post 11.1 10.6 0 28.1 9
## 6 Quaker Oats 4.5 5.3 0 9.5 8
## 7 Ralston Purina 6.6 7 0 14.1 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Fibre_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Fibre (g per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Fibre Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Fibre_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Fibre (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Fibre in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Fibre_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Fibre (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Fibre in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Fibre_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Fibre (g per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Fibre in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Sodium, Sodium_oz, Sodium_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Sodium (mg per serving)", "Sodium (mg per ounce)", "Sodium (mg per 100g)"))# Summary Table of Sodium per 100g
data_cereal %>%
select(Manufacturer_Name, Sodium_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Sodium_100g, na.rm = TRUE), 1),
Median = round(median(Sodium_100g, na.rm = TRUE), 1),
Lowest = min(Sodium_100g, na.rm = TRUE),
Highest = max(Sodium_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 0 0 0 0 1
## 2 General Mills 740. 706. 494. 1023. 22
## 3 Kellogs 670. 706. 0 1129. 23
## 4 Nabisco 132. 26.4 0 459. 6
## 5 Post 557. 600. 159. 938. 9
## 6 Quaker Oats 326. 265. 0 776 8
## 7 Ralston Purina 699. 706. 335. 988. 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Sodium_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Sodium (mg per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Sodium Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Sodium_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Sodium (mg per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Sodium in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Sodium_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Sodium (mg per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Sodium in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Sodium_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Sodium (mg per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Sodium in Breakfast Cereals") +
theme_minimal()data_cereal %>%
select(Name, Manufacturer_Name, Type, Potassium, Potassium_oz, Potassium_100g) %>%
datatable(rownames = NULL, filter = "top", colnames = c("Product", "Manufacturer", "Type", "Potassium (mg per serving)", "Potassium (mg per ounce)", "Potassium (mg per 100g)"))# Summary Table of Potassium per 100g
data_cereal %>%
select(Manufacturer_Name, Potassium_100g, Type) %>%
group_by(Manufacturer_Name) %>%
summarise(Average = round(mean(Potassium_100g, na.rm = TRUE), 1),
Median = round(median(Potassium_100g, na.rm = TRUE), 1),
Lowest = min(Potassium_100g, na.rm = TRUE),
Highest = max(Potassium_100g, na.rm = TRUE),
Count = n())## # A tibble: 7 x 6
## Manufacturer_Name Average Median Lowest Highest Count
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Home Food Products 335. 335. 335. 335. 1
## 2 General Mills 329. 282. 88.2 1217 22
## 3 Kellogs 408. 212. 70.5 1164 23
## 4 Nabisco 500. 423. 278. 988. 6
## 5 Post 455 318. 88.2 1220. 9
## 6 Quaker Oats 248 247. 26.5 476. 8
## 7 Ralston Purina 360. 406. 88.2 600. 8
data_cereal %>%
ggplot(aes(x = Manufacturer_Name, y = Potassium_100g, fill = Manufacturer_Name)) +
geom_boxplot(show.legend = FALSE) +
stat_summary(fun.y = mean, geom = "point", pch = 1, show.legend = FALSE) + # Add average to the boxplot
scale_y_continuous(name = "Potassium (mg per 100g)", minor_breaks = NULL) +
scale_fill_brewer(palette = "Set1") +
coord_flip() +
theme_minimal() +
labs(x = "Manufacturer") +
ggtitle(label = "Distribution of Potassium Content by Manufacturer")data_cereal %>%
ggplot(aes(x = Potassium_100g)) +
geom_density(fill = "grey", alpha = 0.8, linetype = 0) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Potassium (mg per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Proportion", expand = c(0,0)) +
labs(fill = "Manufacturer", title = "Distribution of Potassium in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Potassium_100g, fill = Manufacturer_Name)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1") +
scale_x_continuous(name = "Potassium (mg per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Manufacturer", title = "Distribution of Potassium in Breakfast Cereals") +
theme_minimal()data_cereal %>%
ggplot(aes(x = Potassium_100g, fill = Type)) +
geom_histogram(alpha = 0.8, bins = 8) +
scale_fill_brewer(palette = "Set1", direction = -1) +
scale_x_continuous(name = "Potassium (mg per 100g)", expand = c(0,0)) +
scale_y_continuous(name = "Count", expand = c(0,0), limits = c(0, 40), breaks = seq(0, 40, 10)) +
labs(fill = "Type", title = "Distribution of Potassium in Breakfast Cereals") +
theme_minimal()# install.packages("factoextra")
library(factoextra)
# Create subset for PCA
PCA_data <- data_cereal %>%
select(Name, Manufacturer_Name, Calories = Calories_100g_calculated, Protein = Protein_100g, Fat = Fat_100g, Sodium = Sodium_100g, Fibre = Fibre_100g, Carbohydrates = Carbohydrates_100g, Sugar = Sugar_100g, Potassium = Potassium_100g, Rating)
# Remove observations with NAs
PCA_data <- PCA_data[complete.cases(PCA_data),]
PCA_cereals <- prcomp(PCA_data[, 3:11], scale. = TRUE)
# Obtain Summary of PCA
summary(PCA_cereals)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7585 1.5749 1.2921 0.94776 0.70543 0.55139 0.21930
## Proportion of Variance 0.3436 0.2756 0.1855 0.09981 0.05529 0.03378 0.00534
## Cumulative Proportion 0.3436 0.6192 0.8047 0.90448 0.95977 0.99356 0.99890
## PC8 PC9
## Standard deviation 0.09954 1.821e-16
## Proportion of Variance 0.00110 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
## Warning: package 'ggpubr' was built under R version 3.6.2
# PCA Variables
fviz_pca_var(PCA_cereals,
col.var = "contrib",
repel = TRUE,
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
title = "Principal Component Analysis: Variable Contribution",
legend.title = "Contribution"
)# PCA Biplot: Variables and Individuals
fviz_pca_biplot(PCA_cereals,
geom.ind = "point",
pointshape = 21,
pointsize = 3,
fill.ind = PCA_data$Manufacturer_Name,
# col.ind = "Black",
alpha = 0.8,
mean.point = FALSE,
col.var = factor(c("Input", "Input", "Input", "Input", "Input", "Input", "Input", "Input", "Output")), # Colour inputs and outputs differently
repel = TRUE,
legend.title = list(fill = "Manufacturer", color = "Parameters"),
title = "Principal Component Analysis") +
fill_palette("Set1") + # Palette for individuals
color_palette(palette = "aaas") # Palette for variablesWe need to select the number of clusters that has the optimal value of within sum of squares error (WSS).
Once we have a scree plot, we will select the number of clusters where the WSS improves more slowly as the number of clusters increases.
# Create subset of data for k-means clustering
kmeans_data <- data_cereal %>%
select(Name, Manufacturer_Name, Calories = Calories_100g_calculated, Protein = Protein_100g, Fat = Fat_100g, Sodium = Sodium_100g, Fibre = Fibre_100g, Carbohydrates = Carbohydrates_100g, Sugar = Sugar_100g, Potassium = Potassium_100g, Rating)
# Remove rows with NAs
kmeans_data <- kmeans_data[complete.cases(kmeans_data), ]
# Create a starting vector to add WSS values
wss <- 0
# Loop k-means algorithm for various numbers of clusters
for (i in 1:10) {
kmeans.output <- kmeans(x = kmeans_data[, 3:11], centers = i, nstart = 20)
# Save total within sum of squares to wss vector
wss[i] <- kmeans.output$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
data.frame(Clusters = 1:10, WSS = wss) %>%
ggplot(aes(x = Clusters, y = WSS)) +
geom_point() +
geom_line() +
scale_x_continuous(name = "Number of Clusters", breaks = 1:10, minor_breaks = NULL) +
scale_y_continuous(name = "Within Groups Sum of Squares") +
theme_minimal() +
labs(title = "Scree Plot")From the scree plot above, 3 or 4 seem to be the optimal number of clusters.
## K-means clustering with 3 clusters of sizes 15, 48, 11
##
## Cluster means:
## Calories Protein Fat Sodium Fibre Carbohydrates Sugar
## 1 242.8400 8.286667 3.053333 95.2400 6.593333 45.55333 20.22000
## 2 291.5437 8.514583 3.777083 710.0917 4.752083 55.87292 25.71667
## 3 322.9182 14.027273 4.590909 836.0273 25.663636 56.37273 41.31818
## Potassium Rating
## 1 298.9667 52.42421
## 2 259.5208 37.82648
## 3 976.6545 48.49800
##
## Clustering vector:
## [1] 3 1 3 3 2 2 2 2 3 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 3 3 2 1 2 2 2 1 2 2 2 2 2
## [39] 2 2 2 1 1 2 3 2 2 3 2 2 3 2 1 1 2 3 2 1 2 2 1 1 1 1 2 1 2 3 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 764161.6 2725525.3 1130535.5
## (between_SS / total_SS = 67.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Use clusplot function to plot cluster
library(factoextra)
kmeans.output <- eclust(x = kmeans_data[, 3:11], FUNcluster = "kmeans", k = 3, graph = FALSE)
kmeans.output## K-means clustering with 3 clusters of sizes 27, 11, 36
##
## Cluster means:
## Calories Protein Fat Sodium Fibre Carbohydrates Sugar
## 1 250.9519 8.788889 4.048148 269.7704 6.607407 44.84074 25.73704
## 2 322.9182 14.027273 4.590909 836.0273 25.663636 56.37273 41.31818
## 3 301.6944 8.213889 3.272222 784.1444 4.127778 59.84722 23.41111
## Potassium Rating
## 1 309.1481 46.12256
## 2 976.6545 48.49800
## 3 238.7361 37.68681
##
## Clustering vector:
## [1] 2 1 2 2 3 1 3 3 2 3 3 3 1 3 3 3 1 3 1 3 1 3 1 3 1 2 2 1 1 3 1 3 1 3 3 3 3 3
## [39] 3 1 3 1 1 1 2 3 3 2 3 3 2 3 1 1 1 2 1 1 3 3 1 1 1 1 3 1 3 2 3 3 1 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2226440 1130535 1429583
## (between_SS / total_SS = 66.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "silinfo"
## [11] "nbclust" "data"
fviz_cluster(kmeans.output, ellipse = TRUE, ellipse.type = "norm", ellipse.level = 0.95, ellipse.alpha = 0.1) +
theme_bw()